Correlation between functional selections on humanDAG1 and mastomysDAG1 cells¶

Compare functional scores measured on cells expressing humanDAG1 and mastomysDAG1

In [1]:
# Imports
import os
import pandas as pd
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import altair as alt

# Allow more rows for Altair
_ = alt.data_transformers.disable_max_rows()

File paths for data:

In [2]:
# this cell is tagged as `parameters` for papermill parameterization
HEK293T_data_path = None
humanDAG1_data_path = None
mastomysDAG1_data_path = None
# Minimum times seen filter
MTS = None 
n_selections = None

html_dir = None
html_output = None
In [3]:
# Parameters
HEK293T_data_path = "results/func_effects/averages/293T_entry_func_effects.csv"
humanDAG1_data_path = "results/func_effects/averages/human_293T_entry_func_effects.csv"
mastomysDAG1_data_path = (
    "results/func_effects/averages/mastomys_293T_entry_func_effects.csv"
)
MTS = 2
n_selections = 8
html_dir = "results/DAG1_ortholog_correlations/"
html_output = "results/DAG1_ortholog_correlations/DAG1_ortholog_correlations.html"
In [4]:
# # Uncomment for running interactive
# HEK293T_data_path = "../results/func_effects/averages/293T_entry_func_effects.csv"
# humanDAG1_data_path = "../results/func_effects/averages/human_293T_entry_func_effects.csv"
# mastomysDAG1_data_path = "../results/func_effects/averages/mastomys_293T_entry_func_effects.csv"
# # Minimum times seen filter
# MTS = 2
# n_selections = 8

# html_dir = "../results/DAG1_ortholog_correlations/"
# html_output = "../results/DAG1_ortholog_correlations/DAG1_ortholog_correlations.html"
In [5]:
# Read data
hek_df = pd.read_csv(HEK293T_data_path)
human_df = pd.read_csv(humanDAG1_data_path)
mastomys_df = pd.read_csv(mastomysDAG1_data_path)

# Merge data on intersection of measured values
merged_df = (
    human_df.merge(
        mastomys_df,
        how="inner",
        on=["site", "wildtype", "mutant"],
        suffixes=["_human", "_mastomys"],
        validate="one_to_one",
    )
    .merge(
        hek_df,
        how="inner",
        on=["site", "wildtype", "mutant"],
        validate="one_to_one",
    )
)
merged_df = merged_df.rename(columns={
    "effect" : "effect_HEK293T", 
    "times_seen" : "times_seen_HEK293T", 
    "n_selections" : "n_selections_HEK293T", 
})

# Add average times seen column
merged_df["average_times_seen"] = merged_df[["times_seen_human", "times_seen_mastomys", "times_seen_HEK293T"]].mean(axis=1)

# Filter for number of selections
merged_df = (
    merged_df.loc[
        (merged_df["n_selections_human"] == n_selections)
        &
        (merged_df["n_selections_mastomys"] == n_selections)
        &
        (merged_df["n_selections_HEK293T"] == n_selections)
    ]
)

merged_df
Out[5]:
site wildtype mutant effect_human times_seen_human n_selections_human effect_mastomys times_seen_mastomys n_selections_mastomys effect_HEK293T times_seen_HEK293T n_selections_HEK293T average_times_seen
0 1 M * -4.5400 39.620 8 -3.78400 40.120 8 -4.3130 38.500 8 39.413333
1 1 M A -3.6680 9.500 8 -3.41000 9.250 8 -4.2490 9.375 8 9.375000
2 1 M C -3.4210 15.120 8 -3.27100 15.380 8 -3.8490 14.000 8 14.833333
3 1 M D -3.5640 13.000 8 -3.34900 13.380 8 -4.0930 12.000 8 12.793333
4 1 M E -3.9870 6.125 8 -3.64400 5.750 8 -4.1490 6.750 8 6.208333
... ... ... ... ... ... ... ... ... ... ... ... ... ...
10345 491 R V -0.5255 7.250 8 -0.73620 7.125 8 -0.8019 7.125 8 7.166667
10346 491 R W -1.6390 9.000 8 -1.70400 8.750 8 -1.9180 8.500 8 8.750000
10347 491 R Y -1.1100 4.750 8 0.01844 5.000 8 -1.7270 4.750 8 4.833333
10348 492 * * 0.0000 NaN 8 0.00000 NaN 8 0.0000 NaN 8 NaN
10350 492 * L 0.2745 2.000 8 0.24450 2.000 8 0.4992 1.875 8 1.958333

9820 rows × 13 columns

Plot correlation of scores with an interactive plot

In [6]:
# Calculate statistics
r, p = sp.stats.pearsonr(merged_df.loc[merged_df["average_times_seen"] >= MTS]["effect_human"], merged_df.loc[merged_df["average_times_seen"] >= MTS]["effect_HEK293T"])
print(f"r correlation human vs HEK (min_times_seen={MTS}): {r:.2f}")
print(f"r^2 correlation human vs HEK (min_times_seen={MTS}): {r**2:.2f}")

r, p = sp.stats.pearsonr(merged_df.loc[merged_df["average_times_seen"] >= MTS]["effect_mastomys"], merged_df.loc[merged_df["average_times_seen"] >= MTS]["effect_HEK293T"])
print(f"r correlation mastomys vs HEK (min_times_seen={MTS}): {r:.2f}")
print(f"r^2 correlation mastomys vs HEK (min_times_seen={MTS}): {r**2:.2f}")

r, p = sp.stats.pearsonr(merged_df.loc[merged_df["average_times_seen"] >= MTS]["effect_mastomys"], merged_df.loc[merged_df["average_times_seen"] >= MTS]["effect_human"])
print(f"r correlation mastomys vs human (min_times_seen={MTS}): {r:.2f}")
print(f"r^2 correlation mastomys vs human (min_times_seen={MTS}): {r**2:.2f}")

slider = alt.binding_range(min=1, max=25, step=1, name="times_seen")
selector = alt.param(name="SelectorName", value=MTS, bind=slider)

# Plot data
human_vs_hek = alt.Chart(merged_df).mark_point(filled=True, color="black").encode(
    alt.X(
        "effect_human",
        axis=alt.Axis(
            title=["functional score measured in", "293T\u0394DAG1+humanDAG1 cells"], 
            values=[-6, -4, -2, 0, 2],
            domainWidth=1,
            domainColor="black",
            tickColor="black",
        ),
        scale=alt.Scale(domain=[-6,2])
    ),
    alt.Y(
        "effect_HEK293T",
        axis=alt.Axis(
            title="functional score measured in 293T cells", 
            values=[-6, -4, -2, 0, 2],
            domainWidth=1,
            domainColor="black",
            tickColor="black",
        ),
        scale=alt.Scale(domain=[-6,2])
    ),
    opacity=alt.condition(
        alt.datum.average_times_seen < selector,
        alt.value(0),
        alt.value(0.15)
    ),
    tooltip=[
        "site",
        "wildtype",
        "mutant",
        "effect_human",
        "times_seen_human",
        "n_selections_human",
        "effect_HEK293T",
        "times_seen_HEK293T",
        "n_selections_HEK293T"
    ],
).properties(
    width=300,
    height=300
)

mastomys_vs_hek = alt.Chart(merged_df).mark_point(filled=True, color="black").encode(
    alt.X(
        "effect_mastomys",
        axis=alt.Axis(
            title=["functional score measured in", "293T\u0394DAG1+mastomysDAG1 cells"], 
            values=[-6, -4, -2, 0, 2],
            domainWidth=1,
            domainColor="black",
            tickColor="black",
        ),
        scale=alt.Scale(domain=[-6,2])
    ),
    alt.Y(
        "effect_HEK293T",
        axis=alt.Axis(
            title="functional score measured in 293T cells", 
            values=[-6, -4, -2, 0, 2],
            domainWidth=1,
            domainColor="black",
            tickColor="black",
        ),
        scale=alt.Scale(domain=[-6,2])
    ),
    opacity=alt.condition(
        alt.datum.average_times_seen < selector,
        alt.value(0),
        alt.value(0.15)
    ),
    tooltip=[
        "site",
        "wildtype",
        "mutant",
        "effect_mastomys",
        "times_seen_mastomys",
        "n_selections_mastomys",
        "effect_HEK293T",
        "times_seen_HEK293T",
        "n_selections_HEK293T",
    ],
).properties(
    width=300,
    height=300
)

mastomys_vs_human = alt.Chart(merged_df).mark_point(filled=True, color="black").encode(
    alt.X(
        "effect_human",
        axis=alt.Axis(
            title=["functional score measured in", "293T\u0394DAG1+humanDAG1 cells"], 
            values=[-6, -4, -2, 0, 2],
            domainWidth=1,
            domainColor="black",
            tickColor="black",
        ),
        scale=alt.Scale(domain=[-6,2])
    ),
    alt.Y(
        "effect_mastomys",
        axis=alt.Axis(
            title=["functional score measured in", "293T\u0394DAG1+mastomysDAG1 cells"], 
            values=[-6, -4, -2, 0, 2],
            domainWidth=1,
            domainColor="black",
            tickColor="black",
        ),
        scale=alt.Scale(domain=[-6,2])
    ),
    opacity=alt.condition(
        alt.datum.average_times_seen < selector,
        alt.value(0),
        alt.value(0.15)
    ),
    tooltip=[
        "site",
        "wildtype",
        "mutant",
        "effect_human",
        "times_seen_human",
        "n_selections_human",
        "effect_mastomys",
        "times_seen_mastomys",
        "n_selections_mastomys"
    ],
).properties(
    width=300,
    height=300
)

corr_chart = alt.hconcat(
    human_vs_hek,
    mastomys_vs_hek,
    mastomys_vs_human, 
    spacing=5
).add_params(
   selector
).configure_axis(
    grid=False,
    labelFontSize=16,
    titleFontSize=16,
)

# Make output dir if doesn't exist
if not os.path.exists(html_dir):
    os.mkdir(html_dir)

print(f"Saving to {html_output}")
corr_chart.save(html_output)

corr_chart
r correlation human vs HEK (min_times_seen=2): 0.93
r^2 correlation human vs HEK (min_times_seen=2): 0.87
r correlation mastomys vs HEK (min_times_seen=2): 0.93
r^2 correlation mastomys vs HEK (min_times_seen=2): 0.87
r correlation mastomys vs human (min_times_seen=2): 0.94
r^2 correlation mastomys vs human (min_times_seen=2): 0.89
Saving to results/DAG1_ortholog_correlations/DAG1_ortholog_correlations.html
Out[6]:

Recreate same plot as above but reduce font sizes for a figure in a manuscript

In [7]:
# Calculate statistics
r, p = sp.stats.pearsonr(merged_df.loc[merged_df["average_times_seen"] >= MTS]["effect_human"], merged_df.loc[merged_df["average_times_seen"] >= MTS]["effect_HEK293T"])
print(f"r correlation human vs HEK (min_times_seen={MTS}): {r:.2f}")
print(f"r^2 correlation human vs HEK (min_times_seen={MTS}): {r**2:.2f}")

r, p = sp.stats.pearsonr(merged_df.loc[merged_df["average_times_seen"] >= MTS]["effect_mastomys"], merged_df.loc[merged_df["average_times_seen"] >= MTS]["effect_HEK293T"])
print(f"r correlation mastomys vs HEK (min_times_seen={MTS}): {r:.2f}")
print(f"r^2 correlation mastomys vs HEK (min_times_seen={MTS}): {r**2:.2f}")

r, p = sp.stats.pearsonr(merged_df.loc[merged_df["average_times_seen"] >= MTS]["effect_mastomys"], merged_df.loc[merged_df["average_times_seen"] >= MTS]["effect_human"])
print(f"r correlation mastomys vs human (min_times_seen={MTS}): {r:.2f}")
print(f"r^2 correlation mastomys vs human (min_times_seen={MTS}): {r**2:.2f}")

slider = alt.binding_range(min=1, max=25, step=1, name="times_seen")
selector = alt.param(name="SelectorName", value=MTS, bind=slider)

# Plot data
human_vs_hek = alt.Chart(merged_df).mark_point(filled=True, color="black").encode(
    alt.X(
        "effect_human",
        axis=alt.Axis(
            title=["functional score measured in", "293T\u0394DAG1+humanDAG1 cells"], 
            values=[-6, -4, -2, 0, 2],
            domainWidth=1,
            domainColor="black",
            tickColor="black",
        ),
        scale=alt.Scale(domain=[-6,2])
    ),
    alt.Y(
        "effect_HEK293T",
        axis=alt.Axis(
            title="functional score measured in 293T cells", 
            values=[-6, -4, -2, 0, 2],
            domainWidth=1,
            domainColor="black",
            tickColor="black",
        ),
        scale=alt.Scale(domain=[-6,2])
    ),
    opacity=alt.condition(
        alt.datum.average_times_seen < selector,
        alt.value(0),
        alt.value(0.15)
    ),
    tooltip=[
        "site",
        "wildtype",
        "mutant",
        "effect_human",
        "times_seen_human",
        "n_selections_human",
        "effect_HEK293T",
        "times_seen_HEK293T",
        "n_selections_HEK293T"
    ],
).properties(
    width=115,
    height=115
)

mastomys_vs_hek = alt.Chart(merged_df).mark_point(filled=True, color="black").encode(
    alt.X(
        "effect_mastomys",
        axis=alt.Axis(
            title=["functional score measured in", "293T\u0394DAG1+mastomysDAG1 cells"], 
            values=[-6, -4, -2, 0, 2],
            domainWidth=1,
            domainColor="black",
            tickColor="black",
        ),
        scale=alt.Scale(domain=[-6,2])
    ),
    alt.Y(
        "effect_HEK293T",
        axis=alt.Axis(
            title="functional score measured in 293T cells", 
            values=[-6, -4, -2, 0, 2],
            domainWidth=1,
            domainColor="black",
            tickColor="black",
        ),
        scale=alt.Scale(domain=[-6,2])
    ),
    opacity=alt.condition(
        alt.datum.average_times_seen < selector,
        alt.value(0),
        alt.value(0.15)
    ),
    tooltip=[
        "site",
        "wildtype",
        "mutant",
        "effect_mastomys",
        "times_seen_mastomys",
        "n_selections_mastomys",
        "effect_HEK293T",
        "times_seen_HEK293T",
        "n_selections_HEK293T",
    ],
).properties(
    width=115,
    height=115
)

mastomys_vs_human = alt.Chart(merged_df).mark_point(filled=True, color="black").encode(
    alt.X(
        "effect_human",
        axis=alt.Axis(
            title=["functional score measured in", "293T\u0394DAG1+humanDAG1 cells"], 
            values=[-6, -4, -2, 0, 2],
            domainWidth=1,
            domainColor="black",
            tickColor="black",
        ),
        scale=alt.Scale(domain=[-6,2])
    ),
    alt.Y(
        "effect_mastomys",
        axis=alt.Axis(
            title=["functional score measured in", "293T\u0394DAG1+mastomysDAG1 cells"], 
            values=[-6, -4, -2, 0, 2],
            domainWidth=1,
            domainColor="black",
            tickColor="black",
        ),
        scale=alt.Scale(domain=[-6,2])
    ),
    opacity=alt.condition(
        alt.datum.average_times_seen < selector,
        alt.value(0),
        alt.value(0.15)
    ),
    tooltip=[
        "site",
        "wildtype",
        "mutant",
        "effect_human",
        "times_seen_human",
        "n_selections_human",
        "effect_mastomys",
        "times_seen_mastomys",
        "n_selections_mastomys"
    ],
).properties(
    width=115,
    height=115
)

corr_chart = alt.hconcat(
    human_vs_hek,
    mastomys_vs_hek,
    mastomys_vs_human, 
    spacing=5
).add_params(
   selector
).configure_axis(
    grid=False,
    labelFontSize=8,
    titleFontSize=8,
).configure_point(
    size=10
)

corr_chart
r correlation human vs HEK (min_times_seen=2): 0.93
r^2 correlation human vs HEK (min_times_seen=2): 0.87
r correlation mastomys vs HEK (min_times_seen=2): 0.93
r^2 correlation mastomys vs HEK (min_times_seen=2): 0.87
r correlation mastomys vs human (min_times_seen=2): 0.94
r^2 correlation mastomys vs human (min_times_seen=2): 0.89
Out[7]: